The goal of this notebook is to generate rose plots of the cytokine panels measured in plasma and CSF samples.
The rose plots for the marginalized data appear in Figure 4A.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
## First, grab the molecule names from the original panel files (a bit dirty...).
## Rows 0-2 and 4-6 are skipped, so row 3 of each file supplies the column
## names; only those column names are used afterwards.
_panel_read_options = dict(sep=';', skiprows=[0, 1, 2, 4, 5, 6], index_col=0)
_bookkeeping_columns = ['Plate ID', 'QC Warning']

df = pd.read_csv('data/Inflammation_Plasma_Etter.csv', **_panel_read_options)
df.drop(columns=_bookkeeping_columns, inplace=True)
inflammatoryMolecules = df.columns.tolist()

df = pd.read_csv('data/Neurology_Plasma_Etter.csv', **_panel_read_options)
df.drop(columns=_bookkeeping_columns, inplace=True)
neuroMolecules = df.columns.tolist()

## Molecules present on both panels are kept only in the neurology list.
inter = set(inflammatoryMolecules).intersection(neuroMolecules)
inflammatoryMolecules = [name for name in inflammatoryMolecules if name not in inter]

print(len(inflammatoryMolecules))
print(len(neuroMolecules))
## Read the molecular data; rows are indexed by the 'Assay' column, whose
## values are used as patient identifiers below.
df_molecular = pd.read_csv("imputed_data/ALL_imputed.csv", index_col='Assay')
df_molecular.head()

## Read the metadata, indexed by study ID.
df_metadata = pd.read_csv('data/metadata.csv', index_col='studyID')
# NOTE(review): astype(bool) turns every non-empty string (including "False")
# into True -- confirm that 'covid' is stored as 0/1 or as real booleans.
df_metadata['covid'] = df_metadata['covid'].astype(bool)
df_metadata['Age'] = df_metadata['Age'].astype(int)
df_metadata.head()

## Reshape to long format: one row per (patient, molecule) pair.
df_molecular_tmp = df_molecular.copy()
df_molecular_tmp["patient"] = df_molecular_tmp.index
df_molecular_long = df_molecular_tmp.melt(id_vars=['patient'])
df_molecular_long.patient.value_counts()
df_molecular_long.head()

## Look up each patient's group in the metadata.
## map() yields NaN for a patient that is missing from the metadata, whereas
## replace() would silently keep the patient ID as if it were a group label.
df_molecular_long['Group'] = df_molecular_long.patient.map(df_metadata.Group)
Let's see what we get when we simply plot all of this along a single axis.
## Raw values of every molecule on one shared axis, one line per group.
fig, ax = plt.subplots(figsize=(14, 8))
sns.lineplot(
    data=df_molecular_long,
    x='variable',
    y='value',
    hue='Group',
    ax=ax,
)
OK, let's reduce this to plot a single panel
## Column names of each panel, prefixed by the sample source
## (P = plasma, C = CSF; I = inflammatory panel, N = neurology panel).
PImolec = ["Plasma_" + name for name in inflammatoryMolecules]
PNmolec = ["Plasma_" + name for name in neuroMolecules]
CImolec = ["CSF_" + name for name in inflammatoryMolecules]
CNmolec = ["CSF_" + name for name in neuroMolecules]

## Restrict the plot to the plasma inflammatory panel.
m = df_molecular_long.variable.isin(PImolec)
fig, ax = plt.subplots(figsize=(14, 8))
sns.lineplot(
    data=df_molecular_long.loc[m, :],
    x='variable',
    y='value',
    hue='Group',
    ax=ax,
)
Clearly, the high dynamic range of the data will cause problems.
Let's switch to Z-scores.
def getZscore(c):
    """Return *c* centred on its mean and scaled by its population std (ddof=0).

    Constant data has no spread, so its Z-score is undefined. The plain
    formula then yields NaN (0/0) or, when rounding in the mean leaves a
    spread of a few ulps, arbitrary values of magnitude 1. Such data is
    returned centred only, i.e. as (numerically) zero.

    Parameters
    ----------
    c : pandas.Series (as passed by ``DataFrame.apply``); any object offering
        ``mean()``, ``std(ddof=0)``, ``max()`` and ``min()`` works as well.

    Returns
    -------
    Same type and shape as ``c - c.mean()``.
    """
    centered = c - c.mean()
    spread = c.std(ddof=0)
    # max == min detects constant data exactly; comparing the computed spread
    # with 0 would miss the cases where it is only a rounding residue.
    constant = c.max() == c.min()
    return centered / np.where(constant, 1.0, spread)
## Z-score every molecule (column) independently.
df_molecular_Z = df_molecular.apply(getZscore)

## Sanity check: per-column means and population stds of the Z-scores.
print("means")
print(df_molecular_Z.mean().describe())
print("\nstds")
print(df_molecular_Z.std(ddof=0).describe())

## Long format again, this time for the Z-scores.
df_molecular_tmp = df_molecular_Z.copy()
df_molecular_tmp["patient"] = df_molecular_tmp.index
df_molecular_Z_long = df_molecular_tmp.melt(id_vars=['patient'])
## map() yields NaN for a patient that is missing from the metadata, whereas
## replace() would silently keep the patient ID as if it were a group label.
df_molecular_Z_long['Group'] = df_molecular_Z_long.patient.map(df_metadata.Group)
df_molecular_Z_long.head()

## Plasma inflammatory panel only.
m = df_molecular_Z_long.variable.isin(PImolec)
fig, ax = plt.subplots(figsize=(14, 8))
sns.lineplot(x='variable', y='value', data=df_molecular_Z_long.loc[m, :], hue='Group', ax=ax)
This is much better!
Potential improvement: find a clever way to order the columns (discussed further below).
But first, we want to switch to a polar projection:
def makeRosePlot(df, groups, orderedMol, colorD=None, linewidth=1.0, alpha=0.2):
    """Draw one curve per group on a polar axis, with a shaded +/- SEM band.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows indexed by (group, molecule); must provide the columns
        "value" (drawn as the curve), "std" and "count" (combined into
        SEM = std / sqrt(count)).
    groups : iterable
        Group labels (first index level), plotted in this order.
    orderedMol : list of str
        Molecule labels (second index level) in the order in which they are
        laid out around the circle. Must not be empty.
    colorD : dict, optional
        Maps each group label to a colour. Defaults to a four-colour palette
        for 'ICU COVID', 'Non-ICU COVID', 'healthy control' and
        'inflammatory control'.
    linewidth : float
        Line width of the curves.
    alpha : float
        Opacity of the SEM bands.

    Returns
    -------
    (fig, ax) : the created figure and its polar axes.

    Raises
    ------
    KeyError
        If a group or molecule is missing from ``df`` or a group is missing
        from ``colorD``.
    """
    if colorD is None:
        colorD = {'ICU COVID': '#a6611a',
                  'Non-ICU COVID': '#dfc27d',
                  'healthy control': '#80cdc1',
                  'inflammatory control': '#018571'}

    n_mol = len(orderedMol)
    # One evenly spaced angle per molecule; the last one stops one step short
    # of 2*pi so that it does not coincide with the first.
    angles = np.linspace(0, 2 * np.pi * (1 - 1 / n_mol), n_mol)

    fig = plt.figure(figsize=(14, 14))
    ax = fig.add_subplot(projection='polar')

    for g in groups:
        # Select the group once, then take the molecules from the remaining
        # flat index: a list lookup on a flat index always follows the order
        # of the list, which keeps the values aligned with `angles`.
        stats = df.xs(g, level=0).loc[orderedMol]
        M = stats["value"].to_numpy()
        SEM = stats["std"].to_numpy() / np.sqrt(stats["count"].to_numpy())

        ax.plot(angles, M, c=colorD[g], label=g, linewidth=linewidth)
        ax.fill_between(angles, M + SEM, M - SEM, color=colorD[g], alpha=alpha)

    ## handling labels
    ax.set_xticks(angles)
    Xlabels = []
    for angle, molecule in zip(angles, orderedMol):
        # Keep only what follows the first '_' (drops the source prefix).
        short_name = molecule.partition('_')[2]
        # Pad to 20 characters: on the right-hand side of the circle the
        # padding goes after the name, on the left-hand side before it.
        on_right_half = angle <= np.pi * 0.5 or angle > np.pi * 1.5
        Xlabels.append(short_name.ljust(20) if on_right_half else short_name.rjust(20))
    ax.set_xticklabels(Xlabels, fontfamily='monospace')

    # Rotation of each label: its own angle, plus half a turn wherever the
    # cosine is negative (presumably so that the text is not upside down).
    tick_labels = ax.get_xticklabels()
    angleLabels = np.linspace(0, 2 * np.pi, len(tick_labels) + 1)
    flipped = np.cos(angleLabels) < 0
    angleLabels[flipped] = angleLabels[flipped] + np.pi
    angleLabels = np.rad2deg(angleLabels)

    # Re-draw every tick label as a free text object carrying its own
    # rotation, then blank the original tick labels.
    for label, angle in zip(tick_labels, angleLabels):
        x, y = label.get_position()
        # NOTE(review): the 0.13 offset is a hand-tuned constant.
        lab = ax.text(x, y - 0.13, label.get_text(), transform=label.get_transform(),
                      ha=label.get_ha(), va=label.get_va(), fontfamily='monospace')
        lab.set_rotation(angle)
    ax.set_xticklabels([])

    ax.legend(loc='upper right', bbox_to_anchor=(1.05, 1.05))
    return fig, ax
def makeFlatRosePlot(df, groups, orderedMol, colorD=None, linewidth=None, alpha=0.2):
    """Draw one curve per group on a regular (non-polar) axis, with a +/- SEM band.

    The x positions are the same evenly spaced angles that `makeRosePlot`
    uses, so this is the "unrolled" counterpart of the polar plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows indexed by (group, molecule); must provide the columns
        "value" (drawn as the curve), "std" and "count" (combined into
        SEM = std / sqrt(count)).
    groups : iterable
        Group labels (first index level), plotted in this order.
    orderedMol : list of str
        Molecule labels (second index level) in left-to-right order.
        Must not be empty.
    colorD : dict, optional
        Maps each group label to a colour. Defaults to a four-colour palette
        for 'ICU COVID', 'Non-ICU COVID', 'healthy control' and
        'inflammatory control'.
    linewidth : float, optional
        Line width of the curves; None keeps matplotlib's default.
    alpha : float
        Opacity of the SEM bands.

    Returns
    -------
    (fig, ax) : the created figure and axes.

    Raises
    ------
    KeyError
        If a group or molecule is missing from ``df`` or a group is missing
        from ``colorD``.
    """
    if colorD is None:
        colorD = {'ICU COVID': '#a6611a',
                  'Non-ICU COVID': '#dfc27d',
                  'healthy control': '#80cdc1',
                  'inflammatory control': '#018571'}

    n_mol = len(orderedMol)
    angles = np.linspace(0, 2 * np.pi * (1 - 1 / n_mol), n_mol)

    fig = plt.figure(figsize=(14, 14))
    ax = fig.add_subplot()

    for g in groups:
        # Select the group once, then take the molecules from the remaining
        # flat index: a list lookup on a flat index always follows the order
        # of the list, which keeps the values aligned with `angles`.
        stats = df.xs(g, level=0).loc[orderedMol]
        M = stats["value"].to_numpy()
        SEM = stats["std"].to_numpy() / np.sqrt(stats["count"].to_numpy())

        ax.plot(angles, M, c=colorD[g], label=g, linewidth=linewidth)
        ax.fill_between(angles, M + SEM, M - SEM, color=colorD[g], alpha=alpha)

    ## handling labels
    ax.set_xticks(angles)
    # Keep only what follows the first '_' (drops the source prefix).
    Xlabels = [molecule.partition('_')[2] for molecule in orderedMol]
    ax.set_xticklabels(Xlabels, fontfamily='monospace', rotation='vertical')

    ax.legend(loc='upper right', bbox_to_anchor=(1.05, 1.05))
    return fig, ax
## 'stageGroup' starts as a copy of 'Group'; rows flagged as covid then get
## their 'Stage' value instead.
df_metadata["stageGroup"] = df_metadata.Group
is_covid = df_metadata["covid"].astype(bool)
df_metadata.loc[is_covid, "stageGroup"] = df_metadata.loc[is_covid, "Stage"]
df_metadata.head()
pd.crosstab(df_metadata.Group, df_metadata.stageGroup)

## Long-format Z-scores, annotated with the stage-aware grouping.
df_molecular_tmp = df_molecular_Z.copy()
df_molecular_tmp["patient"] = df_molecular_tmp.index
df_molecular_Z_long = df_molecular_tmp.melt(id_vars=['patient'])
## map() yields NaN for a patient that is missing from the metadata, whereas
## replace() would silently keep the patient ID as if it were a group label.
df_molecular_Z_long['stageGroup'] = df_molecular_Z_long.patient.map(df_metadata.stageGroup)
df_molecular_Z_long.head()

## Plasma inflammatory panel only.
m = df_molecular_Z_long.variable.isin(PImolec)
fig, ax = plt.subplots(figsize=(14, 8))
sns.lineplot(x='variable', y='value', data=df_molecular_Z_long.loc[m, :], hue='stageGroup', ax=ax)
This is much better!
Potential improvement: find a clever way to order the columns (most discriminative? increasing ICU COVID mean?).
## Summary table with one row per (stageGroup, molecule) pair: the mean
## Z-score (column 'value'), its standard deviation and the number of values.
x, y, hue = "variable", "value", 'stageGroup'
GB = df_molecular_Z_long.groupby(by=[hue, x])
_per_cell = GB[y]
test = _per_cell.mean().to_frame()
test = test.assign(**{'std': _per_cell.std(), 'count': _per_cell.count()})
test
## Fixed plotting order of the groups (controls first, then stages I to III).
groups = ['healthy control', 'inflammatory control', 'I', 'II', 'III']

## Order the plasma inflammatory molecules by increasing mean Z-score in the
## reference group.
ListMolec = PImolec
refG = 'III'
# Select the reference group first, then the molecules from the remaining
# flat index: the values are then guaranteed to come back in ListMolec order
# (and a molecule missing from the table raises a KeyError instead of
# silently shifting the ordering).
M = test.xs(refG, level=0).loc[ListMolec, "value"].tolist()
O = np.argsort(M)
orderedMol = [ListMolec[o] for o in O]
# Alternative purple / green palette, kept for reference:
#lut = {'III' :'#762a83',
#       'II' : '#af8dc3',
#       'I' : '#e7d4e8',
#       'healthy control' : '#a6dba0',
#       'inflammatory control' : '#008837' }

# red/yellow / turquoise / blues
lut = {'III': '#073B4C',
       'II': '#118AB2',
       'I': '#06D6A0',
       'healthy control': '#FFCE5C',
       'inflammatory control': '#F26989'}

## Plasma neurology panel, molecules ordered by increasing mean Z-score in
## the reference group.
ListMolec = PNmolec
refG = 'III'
# Select the reference group first, then the molecules from the remaining
# flat index, so that the values come back in ListMolec order.
M = test.xs(refG, level=0).loc[ListMolec, "value"].tolist()
O = np.argsort(M)
orderedMol = [ListMolec[o] for o in O]

fig, ax = makeRosePlot(df=test, groups=groups, orderedMol=orderedMol, colorD=lut)
ax.set_title("Plasma Neurology panel\nZ-scores - SEM error bars", loc='left')
#fig.savefig("images/rose_Plasma_Neurology_neuroCOVID.colors3.png")
# red/yellow / turquoise / blues
lut = {'III': '#073B4C',
       'II': '#118AB2',
       'I': '#06D6A0',
       'healthy control': '#FFCE5C',
       'inflammatory control': '#F26989'}

## One rose plot per (sample source, panel) combination; the three lists
## below are parallel.
ListMolecs = [PImolec, PNmolec, CImolec, CNmolec]
sources = ['Plasma', 'Plasma', 'CSF', 'CSF']
panels = ['Inflammatory', 'Neurology', 'Inflammatory', 'Neurology']

## Mean Z-score per molecule in the reference group, indexed by molecule name.
refG = 'III'
_ref_means = test.xs(refG, level=0)["value"]

for ListMolec, source, panel in zip(ListMolecs, sources, panels):
    # A list lookup on the flat molecule index returns the values in
    # ListMolec order, so the argsort positions index ListMolec correctly.
    M = _ref_means.loc[ListMolec].tolist()
    O = np.argsort(M)
    orderedMol = [ListMolec[o] for o in O]

    fig, ax = makeRosePlot(df=test, groups=groups, orderedMol=orderedMol, colorD=lut)
    ax.set_title(f"{source} {panel} panel\nZ-scores - SEM error bars", loc='left')
    fig.savefig(f"images/rosePlots/rose_{source}_{panel}_neuroCOVID.pdf")
## Switch to the marginalized data set: from here on `df_molecular` holds its
## contents, minus the 'stageGroup' column stored in that file.
df_molecular = pd.read_csv(
    "marginalized_data/data_marginalized_neuroCOVID.csv",
    index_col=0,
)
df_molecular = df_molecular.drop(columns="stageGroup")
df_molecular.head()
def getZscore(c):
    """Return *c* centred on its mean and scaled by its population std (ddof=0).

    Constant data has no spread, so its Z-score is undefined. The plain
    formula then yields NaN (0/0) or, when rounding in the mean leaves a
    spread of a few ulps, arbitrary values of magnitude 1. Such data is
    returned centred only, i.e. as (numerically) zero.

    Parameters
    ----------
    c : pandas.Series (as passed by ``DataFrame.apply``); any object offering
        ``mean()``, ``std(ddof=0)``, ``max()`` and ``min()`` works as well.

    Returns
    -------
    Same type and shape as ``c - c.mean()``.
    """
    centered = c - c.mean()
    spread = c.std(ddof=0)
    # max == min detects constant data exactly; comparing the computed spread
    # with 0 would miss the cases where it is only a rounding residue.
    constant = c.max() == c.min()
    return centered / np.where(constant, 1.0, spread)
## Z-score every column of the marginalized data independently.
df_molecular_Z = df_molecular.apply(getZscore)

## Sanity check: per-column means and population stds of the Z-scores.
print("means")
print(df_molecular_Z.mean().describe())
print("\nstds")
print(df_molecular_Z.std(ddof=0).describe())

## Long format, annotated with the stage-aware grouping.
df_molecular_tmp = df_molecular_Z.copy()
df_molecular_tmp["patient"] = df_molecular_tmp.index
df_molecular_Z_long = df_molecular_tmp.melt(id_vars=['patient'])
## map() yields NaN for a patient that is missing from the metadata, whereas
## replace() would silently keep the patient ID as if it were a group label.
df_molecular_Z_long['stageGroup'] = df_molecular_Z_long.patient.map(df_metadata.stageGroup)
df_molecular_Z_long.head()
## Per the original note, special characters were removed from the column
## names during marginalization. The panel lists are therefore rebuilt with
## '-' and ' ' replaced by '_' BEFORE they are used to select anything from
## the marginalized data; the un-sanitized names would not match its columns.
PImolec = ["Plasma_" + x.replace('-', '_').replace(' ', '_') for x in inflammatoryMolecules]
CImolec = ["CSF_" + x.replace('-', '_').replace(' ', '_') for x in inflammatoryMolecules]
CNmolec = ["CSF_" + x.replace('-', '_').replace(' ', '_') for x in neuroMolecules]
PNmolec = ["Plasma_" + x.replace('-', '_').replace(' ', '_') for x in neuroMolecules]

m = df_molecular_Z_long.variable.isin(PImolec)

## Summary table with one row per (stageGroup, molecule) pair: the mean
## Z-score (column 'value'), its standard deviation and the number of values.
x = "variable"
y = "value"
hue = 'stageGroup'
GB = df_molecular_Z_long.groupby(by=[hue, x])
test = pd.DataFrame(GB[y].mean())
test['std'] = GB[y].std()
test['count'] = GB[y].count()
test

## Fixed plotting order of the groups (controls first, then stages I to III).
groups = ['healthy control', 'inflammatory control', 'I', 'II', 'III']

# red/yellow / turquoise / blues
lut = {'III': '#073B4C',
       'II': '#118AB2',
       'I': '#06D6A0',
       'healthy control': '#FFCE5C',
       'inflammatory control': '#F26989'}

## One rose plot per (sample source, panel) combination; the three lists
## below are parallel.
ListMolecs = [PImolec, PNmolec, CImolec, CNmolec]
sources = ['Plasma', 'Plasma', 'CSF', 'CSF']
panels = ['Inflammatory', 'Neurology', 'Inflammatory', 'Neurology']

## Mean Z-score per molecule in the reference group, indexed by molecule name.
refG = 'III'
_ref_means = test.xs(refG, level=0)["value"]

for ListMolec, source, panel in zip(ListMolecs, sources, panels):
    # A list lookup on the flat molecule index returns the values in
    # ListMolec order, so the argsort positions index ListMolec correctly.
    M = _ref_means.loc[ListMolec].tolist()
    O = np.argsort(M)
    orderedMol = [ListMolec[o] for o in O]

    fig, ax = makeRosePlot(df=test, groups=groups, orderedMol=orderedMol, colorD=lut)
    ax.set_title(f"{source} {panel} panel\nZ-scores - SEM error bars", loc='left')
    fig.savefig(f"images/rosePlots_marginal/rose_marginal_{source}_{panel}_neuroCOVID.pdf")